#clear every (non-hidden) object returned by ls() from the current workspace
#so the script starts from a clean state; loaded packages are not affected
rm(list = ls())

#Load packages
library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(plyr)
library(dotwhisker)
library(lmtest)
library(xtable)
library(sandwich)

#point the session at the folder that holds the source data
setwd("~/Dropbox/Platform Enforcement/v2/00_source_data")

#load the replication dataset (file name is relative to the working directory)
data <- read.csv(file = "ReplicationDataset.csv")

#one data frame per platform, selected on the platform column
fb <- subset(data, subset = platform == "Facebook")
twitter <- subset(data, subset = platform == "Twitter")

####
####
#### SUMMARY STATS
####
####

#count of URLs observed on each platform
table(data[["platform"]])

#note: only live posts are included in this dataset
#share of posts in each labeling status, Twitter first and then Facebook
prop.table(table(twitter[["status_label"]]))
prop.table(table(fb[["status_label"]]))

####
####
#### PLATFORM VARIANCE WITHIN TICKETS
####
####

#frequency table of ticket-platform identifiers (i.e. narratives);
#81 unique ticket-platforms
unique_tickets <- as.data.frame(table(data[["ticket_platform"]]))

#variance of status_label within each ticket-platform:
#all posts pooled first, then Facebook and Twitter separately
url_analysis_ticket <- plyr::ddply(
  .data = data,
  .variables = "ticket_platform",
  .fun = plyr::summarise,
  estimate = var(status_label)
)

fb_url_analysis_ticket <- plyr::ddply(
  .data = fb,
  .variables = "ticket_platform",
  .fun = plyr::summarise,
  estimate = var(status_label)
)

twitter_url_analysis_ticket <- plyr::ddply(
  .data = twitter,
  .variables = "ticket_platform",
  .fun = plyr::summarise,
  estimate = var(status_label)
)

#platform variable: the ticket-platform identifier from its 9th character on
#(presumably the first eight characters are the ticket prefix - confirm
#against the data)
url_analysis_ticket$platform <- substring(
  url_analysis_ticket$ticket_platform,
  first = 9
)

#indicator for any labeling variance: strictly positive variances become 1,
#all other values (zero or missing) are kept as computed
url_analysis_ticket$variance_binary <- replace(
  url_analysis_ticket$estimate,
  which(url_analysis_ticket$estimate > 0),
  1
)

#table 1: within each platform (column-wise proportions), the share of
#narratives with and without any variance in labeling enforcement
x <- prop.table(
  table(
    url_analysis_ticket[["variance_binary"]],
    url_analysis_ticket[["platform"]]
  ),
  margin = 2
)
rownames(x) <- c("No labeling variance","Some labeling variance")
xtable(
  x,
  caption="Portion of narratives with and without any variance in labeling by platform.
       Approximately two-thirds of narratives had at least some labeling variance on both Facebook and Twitter.",
  digits=2
)

#per platform-ticket cell: variance, mean and sum of status_label
#(this replaces the ticket-platform table built earlier under the same name)
url_analysis_ticket <- plyr::ddply(
  .data = data,
  .variables = c("platform", "ticket"),
  .fun = plyr::summarise,
  var_label = var(status_label),
  mean_label = mean(status_label),
  count_label = sum(status_label)
)

#number of rows in each platform-ticket cell
url_analysis_ticket_x <- plyr::ddply(
  .data = data,
  .variables = c("platform", "ticket"),
  .fun = nrow
)

####
####
#### VERIFIED AND LABELED
####
####

#status_label regressed on user_verified, fitted separately for each platform
#coefficient tests use vcovCL standard errors clustered on ticket
reg_ver_twitter <- lm(formula = status_label ~ user_verified, data = twitter)
reg_ver_twitter_cl <- lmtest::coeftest(
  reg_ver_twitter,
  vcov. = sandwich::vcovCL,
  cluster = ~ticket
)

reg_ver_fb <- lm(formula = status_label ~ user_verified, data = fb)
reg_ver_fb_cl <- lmtest::coeftest(
  reg_ver_fb,
  vcov. = sandwich::vcovCL,
  cluster = ~ticket
)

####
####
#### LANGUAGE AND LABELED
####
####

#status_label regressed on the english indicator, fitted separately for each
#platform; coefficient tests use vcovCL standard errors clustered on ticket
reg_lang_twitter <- lm(formula = status_label ~ english, data = twitter)
reg_lang_twitter_cl <- lmtest::coeftest(
  reg_lang_twitter,
  vcov. = sandwich::vcovCL,
  cluster = ~ticket
)

reg_lang_fb <- lm(formula = status_label ~ english, data = fb)
reg_lang_fb_cl <- lmtest::coeftest(
  reg_lang_fb,
  vcov. = sandwich::vcovCL,
  cluster = ~ticket
)

####
####
#### CONTENT TYPE AND LABELED
####
####

#status_label regressed on content_type, fitted separately for each platform
#coefficient tests use vcovCL standard errors clustered on ticket
reg_type_twitter <- lm(formula = status_label ~ content_type, data = twitter)
reg_type_twitter_cl <- lmtest::coeftest(
  reg_type_twitter,
  vcov. = sandwich::vcovCL,
  cluster = ~ticket
)

reg_type_fb <- lm(formula = status_label ~ content_type, data = fb)
reg_type_fb_cl <- lmtest::coeftest(
  reg_type_fb,
  vcov. = sandwich::vcovCL,
  cluster = ~ticket
)

####
####
#### URL SHARED AND LABELED
####
####

#status_label regressed on url_shared, fitted separately for each platform
#coefficient tests use vcovCL standard errors clustered on ticket
reg_url_twitter <- lm(formula = status_label ~ url_shared, data = twitter)
reg_url_twitter_cl <- lmtest::coeftest(
  reg_url_twitter,
  vcov. = sandwich::vcovCL,
  cluster = ~ticket
)

reg_url_fb <- lm(formula = status_label ~ url_shared, data = fb)
reg_url_fb_cl <- lmtest::coeftest(
  reg_url_fb,
  vcov. = sandwich::vcovCL,
  cluster = ~ticket
)

####
####
#### IMAGE IN POST AND LABELED
####
####

#status_label regressed on image_in_post, fitted separately for each platform
#coefficient tests use vcovCL standard errors clustered on ticket
reg_image_twitter <- lm(formula = status_label ~ image_in_post, data = twitter)
reg_image_twitter_cl <- lmtest::coeftest(
  reg_image_twitter,
  vcov. = sandwich::vcovCL,
  cluster = ~ticket
)

reg_image_fb <- lm(formula = status_label ~ image_in_post, data = fb)
reg_image_fb_cl <- lmtest::coeftest(
  reg_image_fb,
  vcov. = sandwich::vcovCL,
  cluster = ~ticket
)

####
####
#### VIDEO IN POST AND LABELED
####
####

#relationship between video in post and content labeled, by platform
#status_label regressed on video_in_post, fitted separately for each platform
#coefficient tests use vcovCL standard errors clustered on ticket
reg_video_twitter <- lm(formula = status_label ~ video_in_post, data = twitter)
reg_video_twitter_cl <- lmtest::coeftest(
  reg_video_twitter,
  vcov. = sandwich::vcovCL,
  cluster = ~ticket
)

reg_video_fb <- lm(formula = status_label ~ video_in_post, data = fb)
reg_video_fb_cl <- lmtest::coeftest(
  reg_video_fb,
  vcov. = sandwich::vcovCL,
  cluster = ~ticket
)

####
####
#### PLOT COMBINING REGRESSIONS
####
####

#helper: tidy a coeftest result into one row per term, drop the intercept
#row, and record which platform the model belongs to in a `model` column
tidy_no_intercept <- function(coef_test, platform_name) {
  tidied <- broom::tidy(coef_test)
  tidied <- filter(tidied, term != "(Intercept)")
  mutate(tidied, model = platform_name)
}

#replace each clustered coefficient test with its tidied, intercept-free form
reg_ver_twitter_cl <- tidy_no_intercept(reg_ver_twitter_cl, "Twitter")
reg_ver_fb_cl <- tidy_no_intercept(reg_ver_fb_cl, "Facebook")
reg_lang_twitter_cl <- tidy_no_intercept(reg_lang_twitter_cl, "Twitter")
reg_lang_fb_cl <- tidy_no_intercept(reg_lang_fb_cl, "Facebook")
reg_url_twitter_cl <- tidy_no_intercept(reg_url_twitter_cl, "Twitter")
reg_url_fb_cl <- tidy_no_intercept(reg_url_fb_cl, "Facebook")
reg_image_twitter_cl <- tidy_no_intercept(reg_image_twitter_cl, "Twitter")
reg_image_fb_cl <- tidy_no_intercept(reg_image_fb_cl, "Facebook")
reg_video_twitter_cl <- tidy_no_intercept(reg_video_twitter_cl, "Twitter")
reg_video_fb_cl <- tidy_no_intercept(reg_video_fb_cl, "Facebook")

#stack the tidied results, Twitter then Facebook for each predictor
#(the content_type models are not part of this stack)
all_models <- rbind(
  reg_ver_twitter_cl, reg_ver_fb_cl,
  reg_lang_twitter_cl, reg_lang_fb_cl,
  reg_url_twitter_cl, reg_url_fb_cl,
  reg_image_twitter_cl, reg_image_fb_cl,
  reg_video_twitter_cl, reg_video_fb_cl
)

#give the predictors readable labels, then convert to a plain data frame
all_models <- all_models %>%
  relabel_predictors(c(
    user_verified = "User verified",
    english = "Post in English",
    url_shared = "URL shared in post",
    image_in_post = "Image shared in post",
    video_in_post = "Video shared in post"
  )) %>%
  as.data.frame()

#make plot - Figure 3
#dot-and-whisker plot of the combined coefficient table, with a dashed
#reference line at zero and the x-axis limited to [-0.75, 0.75]
#the plot object is built before the PDF device is opened, so an error while
#constructing it cannot leave a device open
labeled_coef_plot <- dwplot(all_models, show_intercept = TRUE,
                            dot_args = list(size = 3.2),
                            whisker_args = list(size = 1),
                            vline = geom_vline(xintercept = 0, colour = "grey60", linetype = 2)) +
  theme_bw() +
  xlab("Coefficient estimate for post labeled") +
  xlim(-.75,.75) +
  theme(text = element_text(size=30))

#ggplot objects are only drawn when printed; auto-printing does not happen
#when the script is run via source(), which would leave the PDF empty, so the
#plot is printed explicitly
pdf(file="~/Dropbox/Platform Enforcement/v2/20_results/dwplot.pdf", width=12)
print(labeled_coef_plot)
dev.off()

